/*
	Author: Jacobus Cilliers
	Purpose: Replication code for language transfer paper 
*/

***********************
***	PRELIMINARIES	***
***********************


if c(username) == "jacob" { 
	global root "C:\Users\jacob\Dropbox\EGRS II Data\Language Transition Paper\replication\" 
	}

do "$root\do files\0 master.do"	


************************************************************
***	TABLE 1. Timeline	***
************************************************************

*manually entered. 

************************************************************
***	TABLE 1. Data description	***
************************************************************

*manually entered. 


************************************************************
***	TABLE 3 AND FIGURE 1	***
************************************************************

use "$EGRSI", clear
gen egrs = 1
*Comprehension as mean
gen W3_LT_eng_score_mean = W3_LT_eng_score / 8				
*Rename assessment instruments. 
ren gr4_la_tsk_6_tot gr4_efal_words
lab var gr4_efal_words "Word Recognition"
ren gr4_la_tsk_7a_tot gr4_efal_orf
lab var gr4_efal_orf "Oral Reading Fluency (ORF)"
ren gr4_la_tsk_7b_mean gr4_efal_orf_compr
lab var gr4_efal_orf_compr "ORF Comprehension"
ren gr4_wa_engl_mean gr4_efal_written_compr
lab var gr4_efal_written_compr "Written Compr."
ren gr4_la_tsk_4a_tot gr4_hl_orf
lab var gr4_hl_orf "Oral Reading Fluency (ORF)"
ren gr4_la_tsk_4b_mean gr4_hl_orf_compr
lab var gr4_hl_orf_compr "ORF Comprehension"
ren gr4_wa_home_mean gr4_hL_written_compr  
lab var gr4_hL_written_compr "Written Comprehension"
ren W3_LT_score_D1 gr2_hl_orf
*Create binary benchmark variables
gen gr4_hl_benchmark = gr4_hl_orf >= 60
replace gr4_hl_benchmark= . if gr4_hl_orf  == .
gen gr4_efal_benchmark = gr4_efal_orf >= 70
replace gr4_efal_benchmark = . if gr4_efal_orf == .
gen gr2_hl_benchmark = gr2_hl_orf >= 60 //Question: should we use 40 or 60? 
replace gr2_hl_benchmark = . if gr2_hl_orf == .
*Create binary non-reader variables. 
gen gr4_efal_nonreader = gr4_efal_orf == 0
replace gr4_efal_nonreader = . if gr4_efal_orf == .
gen gr2_hl_nonreader = gr2_hl_orf == 0
replace gr2_hl_nonreader = . if gr2_hl_orf == .
gen gr4_hl_nonreader = gr4_hl_orf == 0
replace gr4_hl_nonreader = . if gr4_hl_orf == .

*only keep control observations
keep if treatment == 0 
*Only keep relevant variables
keep gr4_* gr2_* egrs idschool
tempfile egrs1
save `egrs1'

use "$EGRSII", clear

keep if treatment == 1 | treatment == .
gen egrs = 2 
*To make the words read comparable we need to drop *one* point for someone who actually got the 105th word. 
replace w5_LA_ss_2_1 = w5_LA_ss_2_1 - 1 if w5_LA_eng_words_105 == 1
*Create mean values for comprehension
foreach var of varlist w5_LA_ss_1_3 w5_LA_ss_6 w5_LA_ss_3_3 w5_LA_ss_8 {
	qui sum `var'
	gen `var'_mean = `var'/`r(max)'
}
*Same variable name for the same assessment insturment, across both studies. 
ren w5_LA_ss_2_1   		gr4_efal_words
ren w5_LA_ss_3_1   		gr4_efal_orf
ren w5_LA_ss_3_3_mean  	gr4_efal_orf_compr
ren w5_LA_ss_8_mean    	gr4_efal_written_compr
ren w5_LA_ss_1_1  		gr4_hl_orf
ren w5_LA_ss_1_3_mean  	gr4_hl_orf_compr
ren w5_LA_ss_6_mean  	gr4_hL_written_compr
ren w3_LA_ss_4_1 		gr2_hl_orf
ren w4_LA_ss_3_1 		gr3_hl_orf
*Create binary benchmark variables
gen gr4_hl_benchmark = gr4_hl_orf >= 35
replace gr4_hl_benchmark= . if gr4_hl_orf  == .
gen gr4_efal_benchmark = gr4_efal_orf >= 70
replace gr4_efal_benchmark = . if gr4_efal_orf == .
gen gr2_hl_benchmark = gr2_hl_orf >= 35 //Question: should we use 20 or 35? 
replace gr2_hl_benchmark = . if gr2_hl_orf == .
gen gr3_hl_benchmark = gr3_hl_orf  >= 35
replace gr3_hl_benchmark = . if gr3_hl_orf == .
*Create binary non-reader variables. 
gen gr4_efal_nonreader = gr4_efal_orf == 0
replace gr4_efal_nonreader = . if gr4_efal_orf == .
gen gr3_hl_nonreader = gr3_hl_orf == 0
replace gr3_hl_nonreader = . if gr3_hl_orf == .
gen gr2_hl_nonreader = gr2_hl_orf == 0
replace gr2_hl_nonreader = . if gr2_hl_orf == .
gen gr4_hl_nonreader = gr4_hl_orf == 0
replace gr4_hl_nonreader = . if gr4_hl_orf == .



*Same schoolID
ren NatEmis idschool
*Only keep relevant variables
keep gr4_* gr2_* gr3_* egrs idschool

append using `egrs1'

lab def egr 1 "EGRS I" 2 "EGRS II"
lab val egrs egr
lab var gr4_hl_benchmark "Grade 4 (HL)"
lab var gr3_hl_benchmark "Grade 3 (HL)"
lab var gr2_hl_benchmark "Grade 2 (HL)"
lab var gr4_efal_benchmark "Grade 4 (Eng.)"

lab var gr2_hl_nonreader "Grade 2 (L1)"
lab var gr4_hl_nonreader "Grade 4 (L1)"
lab var gr3_hl_nonreader "Grade 3 (L1)"
lab var gr4_efal_nonreader "Grade 4 (Eng.)"


	statplot gr2_hl_nonreader gr3_hl_nonreader gr4_hl_nonreader gr4_efal_nonreader, over(egrs) ///
		yscale(r(0 0.4)) ylab(, labsize(small)) ///
		title("") ///
		asyvars  ///
		ytitle("Proportion of students")  ///
		blabel(bar, format(%9.2f) size(small)) ///
		graphregion(color(white)) plotregion(color(white))   aspectratio(1) ///
		legend(pos(6) col(3) size(small)) vertical
		graph export "$figures\figure_1.png", replace
		
		

iebaltab $assmmts, ///
	grpvar(egrs) save("$tables/table_3.xlsx")  ///
	replace rowvarlabels  vce(cluster idschool)
		


***************
***	EGRS I	***
***************
use "$EGRSI", clear

*Restrict to coaching and control
keep if treatment == 0 | treatment == 2
*Label treatment variable
lab var T2 "Treatment"
	
	**************************
	**Table A2.1. Panel A. Balance (EGRS I) **
	**************************

	cap erase "$tables/balance_egrs1.xls"
	cap erase "$tables/balance_egrs1.txt"
	
	foreach var of varlist age_best_W1and2 boy BL_totscore_SD {
		qui reg `var' T2   i.strata, cl(W2_idschool)
		outreg2 using "$tables/balance_egrs1.xls", append keep(T2) label dec(3) nocons nonotes nodepvars
		}	
	foreach var of varlist age_best_W1and2 boy BL_totscore_SD {
		qui reg `var' T2  i.strata if W4_attrite  == 0, cl(W2_idschool)
		outreg2 using "$tables/balance_egrs1.xls", append keep(T2) label dec(3) nocons nonotes nodepvars
	}
	

	**************************
	**Table A2.1. Panel A. Attrition (EGRS I) **
	**************************
	qui reg W4_attrite i.strata T2, cl(W2_idschool)
		qui sum W4_attrite if treatment == 0
		local m = `r(mean)'
		eststo attrite, addscalars(m `m')
		outreg2 using "$tables/attrition_egrs1.xls", replace keep(T2 ) label dec(3) nocons nonotes nodepvars adds(Mean attrition, `m')
	
		eststo clear
	foreach var of varlist age_best_W1and2 boy BL_totscore_SD  {
		qui reg `var' T2  W4_attrite W4_attritexT2 i.strata, cl(W2_idschool)
		outreg2 using "$tables/attrition_egrs1.xls", append keep(T2  W4_attrite W4_attritexT2) label dec(3) nocons nonotes nodepvars
			}	
	

	****************************************************
	**Table 4. Estiamtes taken from other studies **
	****************************************************
	
	***********************
	**C. Figure 2 Treatment Effects by study and year**
	***********************	
eststo clear
foreach var of varlist W2_LT_totscore_SD W3_LT_totscore_SD {
	qui reg `var' T2 $controls1 i.strata , cl(W2_idschool)	
	eststo `var'
}
	set scheme plotplain	
	coefplot W2_LT_totscore_SD W3_LT_totscore_SD ///
			, keep(T2) vertical yline(0) ci(90) mlab format(%9.2g) mlabsize(medium) ///
			coeflabels(T2= " ", labsize(tiny)) ytitle("Standard Deviations") ///
				legend(label(2 "Year 1") label(4 "Year 2") position(6) rows(1) size(medium))	
	graph export "$figures\Figure2_c.png", replace
	
	
	
	*********************************************************************
	**Table 5. PAnel A. Impacts on home language and English literacy **
	*********************************************************************
	
foreach var of varlist 	gr4_la_tsk_4a_tot gr4_la_tsk_4b_mean   /// HL ORF, ORF comprehension, written comprehension, 
						 gr4_la_tsk_7a_tot gr4_la_tsk_7b_mean   /// English  letters, ORF, ORF comprehension, written comprehension, 
			{
			reg `var' T2 $controls1 i.strata , cl(W2_idschool)		
			sum `var' if treatment == 0
			local m = `r(mean)'
			eststo `var', addscalars(m `m')
			}
				
	esttab 	 	gr4_la_tsk_4a_tot gr4_la_tsk_4b_mean   /// HL ORF, ORF comprehension, 
				 gr4_la_tsk_7a_tot gr4_la_tsk_7b_mean   /// English  ORF, ORF comprehension, 
				using "$tables/table5.csv", replace ///
				label se(3) b(3) alignment(cl) r2 ///
				stats(m N r2,  fmt(3 0 3) labels("Control mean" "Observations" "R-squared" )) ///	
				keep( T2) ///
				collabels(none)	 ///
				mgroups("Home Language" "English" , pattern(1 0 1 0)) ///
				mtitles("(ORF"	"Reading compr." "ORF"	"Reading compr.") ///
				nodepvars nonotes  star(  * 0.1  **  0.05 *** 0.01) 							
			

			
	**************************
	**Table 6. Mediation Analysis **
	**************************
	*The role of HL grade 2 literacy in mediating the impact of gr4 English literacy. 				
		
	qui reg gr4_la_tsk_7a_tot T2 $controls1 i.strata if W3_LT_score_A ~= ., cl(W2_idschool)		
	outreg2 using "$tables/table6.xls", replace keep(T2 ) label dec(3) nocons nonotes nodepvars

	foreach var of varlist W3_LT_score_A W3_LT_score_B W3_LT_score_D1 W3_LT_score_D2 {
		qui reg gr4_la_tsk_7a_tot T2 `var' $controls1 i.strata , cl(W2_idschool)
		outreg2 using "$tables/table6.xls", append keep(T2  `var') label dec(3) nocons nonotes nodepvars

	}		
 

***************
***	EGRS II	***
***************

use "$EGRSII", clear
*Restrict to onsite coaching and control
keep if treatment == 1 | treatment == 2
*Label treatment variable
lab var w1_S_T1 "Treatment"

	**************************
	**Table A.2.1 Panel B. Balance **
	**************************
			
			
	cap erase "$tables/balance_egrs2.xls"
	cap erase "$tables/balance_egrs2.txt"
	
	foreach var of varlist w1_LA_best_age w1_LA_learner_boy w1_LA_is_SD1 {
		qui reg `var' w1_S_T1   i.strata, cl(NatEmis)
		outreg2 using "$tables/balance_egrs2.xls", append keep(w1_S_T1) label dec(3) nocons nonotes nodepvars
		}	
	foreach var of varlist w1_LA_best_age w1_LA_learner_boy w1_LA_is_SD1 {
		qui reg `var' w1_S_T1  i.strata if w5_attrit  == 0, cl(NatEmis)
		outreg2 using "$tables/balance_egrs2.xls", append keep(w1_S_T1) label dec(3) nocons nonotes nodepvars
	}
				
	




	**************************************
	**Table A.2.2. Panel B. Attrition **
	**************************************

	qui reg w5_attrit i.strata w1_S_T1, cl(NatEmis)
		qui sum w5_attrit if treatment == 1
		local m = `r(mean)'
		eststo attrite, addscalars(m `m')
		outreg2 using "$tables/attrition_egrs2.xls", replace keep(w1_S_T1 ) label dec(3) nocons nonotes nodepvars adds(Mean attrition, `m')

		eststo clear
	foreach var of varlist w1_LA_best_age w1_LA_learner_boy w1_LA_is_SD1  {
		qui reg `var' w1_S_T1 w5_attrit   w5_attritexT1  i.strata, cl(NatEmis)
		outreg2 using "$tables/attrition_egrs2.xls", append keep(w1_S_T1  w5_attrit w5_attritexT1 ) label dec(3) nocons nonotes nodepvars
			}

	
	*********************************************************************
	**Table 5. PAnel B. Impacts on Home langauge and English literacy  **
	*********************************************************************

*Create mean values for comprehension
foreach var of varlist w5_LA_ss_1_3 w5_LA_ss_6 w5_LA_ss_3_3 w5_LA_ss_8 { //Divide by 8, 6, 8, 7. In each case someone got the maximum amount. 
	qui sum `var'
	gen `var'_mean = `var'/`r(max)'
}

	
foreach var of varlist 	w5_LA_ss_1_1   	w5_LA_ss_1_3_mean  /// HL ORF, ORF comprehension, 
						  w5_LA_ss_3_1 	w5_LA_ss_3_3_mean  ///  ORF, ORF comprehension,  
			{
			qui reg `var' w1_S_T1 $controls2, cl(NatEmis)		
			sum `var' if treatment == 1 
			local m = `r(mean)'
			eststo `var', addscalars(m `m')
			}
				
	esttab 	 w5_LA_ss_1_1  	w5_LA_ss_1_3_mean  /// HL ORF, ORF comprehension, written comprehension, 
				w5_LA_ss_3_1     w5_LA_ss_3_3_mean  /// English  ORF, ORF comprehension, written comprehension, 
				using "$tables/table5.csv", append ///
				label se(3) b(3) alignment(cl) r2 substitute(\_ _ $ \$ ) ///
				stats(m N r2,  fmt(3 0 3) labels("Control mean" "Observations" "R-squared" )) ///	
				keep( w1_S_T1) ///
				collabels(none)	 ///
				 nomtitles ///
				nodepvars nonotes  star(  * 0.1  **  0.05 *** 0.01) 							


	*****************************
	** Figures 2(a) and 2(b)   **
	*****************************	


use "$EGRSII", clear
*Restrict to onsite coaching and control
keep if treatment == 1 | treatment == 2
*Label treatment variable
lab var w1_S_T1 "Treatment"

						** Some more data construction **
	
	** PRIMARY OUTCOMES: mean indices for: 
	** (1) English Language Proficiency: Listening Comprehension & Vocabulary
	** (2) English Decoding: Word recognition, ORF, ORF Comprehension, Written Comprehension
	
	pca w4_LA_ss_6 w4_LA_ss_7 if treat == 1, comp(1)
	predict w4_LA_OralProf
	
	pca w4_LA_ss_4 w4_LA_ss_5_1 w4_LA_ss_5_3 w4_LA_ss_9 if treat == 1, comp(1)
	predict w4_LA_Decoding
	
	order w4_LA_OralProf w4_LA_Decoding, after(w4_LA_ss_10)
	
	** SECONDARY OUTCOMES: mean indices for (1) Home Language Reading; (2) Mathematics
	pca w4_LA_ss_2 w4_LA_ss_3_1 w4_LA_ss_3_3 w4_LA_ss_8 if treat == 1, comp(1)
	predict w4_LA_HLProf

	order w4_LA_HLProf, after(w4_LA_Decoding)
	
	
	foreach var of varlist w4_LA_ss_1_1 - w4_LA_HLProf  {
		sum `var' if treat == 1
		gen Z_`var' = (`var' - `r(mean)')/`r(sd)'
		}

	**Create three different mean indeces for for each wave of data collection: for English reading and vocabulary, and HL reading. And make names comparable
	pca w3_LA_ss_5* w3_LA_ss_6*  if treat == 1, comp(1)
	predict w3_LA_Decoding
	pca w3_LA_ss_7 w3_LA_ss_8 w3_LA_ss_9 if treat == 1, comp(1)
	predict w3_LA_OralProf
	pca w3_LA_ss_3 w3_LA_ss_4*  if treat == 1, comp(1)
	predict w3_LA_HLProf
	
	*New decoding and HL index for w2 (We alread have a vocab and ORF index)
	pca w2_LA_ss_3 w2_LA_ss_41  w2_LA_ss_511 w2_LA_ss_521 if treat == 1, comp(1) //perhaps we should not inclde 41 "letter recognition"
	predict w2_LA_Decoding
	ren w2_LA_is_LS w2_LA_OralProf
	/*pca  w2_LA_ss_71 w2_LA_ss_81 if treat == 1, comp(1)
	predict w2_LA_OralProf*/
	pca w2_LA_ss_1 w2_LA_ss_21  if treat == 1, comp(1)
	predict w2_LA_HLProf
		

	**Include weights, so each school gets an equal weight. Some schools have ~10 students, some have ~10. We want to give each school equal weight. Repeat for each wave. 
	bys NatEmis: egen num_pup = count(w4_LA_OralProf)
	bys NatEmis: egen w4_num_pup = count(w4_LA_OralProf)	
	bys NatEmis: egen w3_num_pup = count(w3_LA_OralProf)	
	bys NatEmis: egen w2_num_pup = count(w2_LA_Decoding )	

	
	*Standardize these names
	foreach var of varlist w2_LA_OralProf w2_LA_Decoding w3_LA_Decoding w3_LA_OralProf w3_LA_HLProf w2_LA_HLProf {
		sum `var' if treat == 1
		gen Z_`var' = (`var' - `r(mean)')/`r(sd)'
		}	
	
	**Dynamics: INdeces
	egen non_mis =  rowmiss(Z_w?_LA_OralProf)
	eststo clear	
	foreach name in LA_OralProf LA_Decoding {
		forvalues i = 2/4 {
				qui reg Z_w`i'_`name' w1_S_T1 ///
					w1_LA_learner_boy w1_LA_best_age w5_LA_Language_Zulu w1_LA_ss_1 w1_LA_ss_2_1 w1_LA_ss_2_2 w1_LA_ss_3 w1_LA_ss_4 w1_LA_ss_51 w1_LA_ss_9 i.w1_S_strata ///
					i.w`i'_LA_fieldworker [pweight = 1/w`i'_num_pup] if non_mis == 0, cl(NatEmis)
				eststo Z_w`i'_`name'
			}
		}
	set scheme plotplain	
	coefplot Z_w2_LA_Decoding Z_w3_LA_Decoding Z_w4_LA_Decoding  ///Decode
			, keep(w1_S_T1) vertical yline(0) ci(90) mlab format(%9.2g) mlabsize(medium)  ///
			coeflabels(w1_S_T1= " ", labsize(tiny)) ytitle("Standard Deviations") ///
				legend(label(2 "Year 1") label(4 "Year 2") label(6 "Year 3") position(6) rows(1) size(medium))	
	graph export "$figures\Figure2_b.png", replace

	coefplot Z_w2_LA_OralProf  Z_w3_LA_OralProf Z_w4_LA_OralProf   ///Vocab
			, keep(w1_S_T1) vertical yline(0) ci(90) mlab format(%9.2g) mlabsize(medium) ///
			coeflabels(w1_S_T1= " ", labsize(tiny)) ytitle("Standard Deviations") ///
				legend(label(2 "Year 1") label(4 "Year 2") label(6 "Year 3") position(6) rows(1) size(medium))	
	graph export "$figures\Figure2_a.png", replace
	
 		

	
		
	***********************
	**Figure 3. Quantile regressions **
	***********************						
	
ren w4_LA_ss_2 gr3_hl_letterrecog
ren w4_LA_ss_3_1  gr3_hl_orf
ren w4_LA_ss_4 gr3_efal_wordrecog
ren w4_LA_ss_5_1 gr3_efal_orf
	
foreach var of varlist  gr3_hl_letterrecog gr3_hl_orf gr3_efal_wordrecog gr3_efal_orf {
	* Define regression options
	local quantileInterval = 0.1
	local statSignLevel    = 90	
	*loop on the variables
	* Initiate locals (to be filled in the next loop)
	local quantileLabels   = ""		//x-axis label
	local modelsList 	   = ""		//estimates name
	local quantileCount    = 1		//count of quantile regression
	* Loop on quantiles
	forv  quantile 		   = `quantileInterval'(`quantileInterval')`=1-`quantileInterval'' {
			
		local    roundQuantile =  round(real(string(`quantile'*100, "%4.0f")),10)/100 //this is needed, otherwise Stata gives issue with the last numbers of the sequence and with rounding
		local  integerQuantile = `roundQuantile'*100
			
		* Store estimates from quantile regression with fixed effects and clustered standard errors
		eststo est_q`integerQuantile': 									///
		   qui qreg2 `var' w1_S_T1 ///
			 , q(`roundQuantile') cl(NatEmis)
			
		* Add these estimate to locals
		local  quantileLabels `"`quantileLabels' `quantileCount++' "`integerQuantile'"	"'
		local  modelsList 	  `"`modelsList' 						  est_q`integerQuantile' || "'
	}
		coefplot `modelsList', keep(w1_S_T1 w1_S_T2) ///
			vertical bycoefs msymbol(diamond) mcolor(black) 	///
			levels(`statSignLevel') recast(line) lwidth(*2)	lcolor(black) ciopts(recast(rline) lcolor(black*0.75) lpattern(dash))				///
			   title("")  ytitle("Number of words") ///
			   yline(0, lstyle(foreground)) xlab(none) xlab(`quantileLabels', add)	legend(off) graphregion(color(white))
		graph export "$figures/Figure3_`var'.png", replace 
	}


foreach var of varlist   gr3_hl_letterrecog {
	* Define regression options
	local quantileInterval = 0.1
	local statSignLevel    = 90	
	*loop on the variables
	* Initiate locals (to be filled in the next loop)
	local quantileLabels   = ""		//x-axis label
	local modelsList 	   = ""		//estimates name
	local quantileCount    = 1		//count of quantile regression
	* Loop on quantiles
	forv  quantile 		   = `quantileInterval'(`quantileInterval')`=1-`quantileInterval'' {
			
		local    roundQuantile =  round(real(string(`quantile'*100, "%4.0f")),10)/100 //this is needed, otherwise Stata gives issue with the last numbers of the sequence and with rounding
		local  integerQuantile = `roundQuantile'*100
			
		* Store estimates from quantile regression with fixed effects and clustered standard errors
		eststo est_q`integerQuantile': 									///
		   qui qreg2 `var' w1_S_T1 ///
			 , q(`roundQuantile') cl(NatEmis)
			
		* Add these estimate to locals
		local  quantileLabels `"`quantileLabels' `quantileCount++' "`integerQuantile'"	"'
		local  modelsList 	  `"`modelsList' 						  est_q`integerQuantile' || "'
	}
		coefplot `modelsList', keep(w1_S_T1 w1_S_T2) ///
			vertical bycoefs msymbol(diamond) mcolor(black) 	///
			levels(`statSignLevel') recast(line) lwidth(*2)	lcolor(black) ciopts(recast(rline) lcolor(black*0.75) lpattern(dash))				///
			   title("")  ytitle("Number of letters") ///
			   yline(0, lstyle(foreground)) xlab(none) xlab(`quantileLabels', add)	legend(off) graphregion(color(white))
		graph export "$figures/Figure3_`var'.png", replace 
	}


		
